{r} # phase_count_fatal <- df_aviation %>% # filter(Injury.Severity != "Non-Fatal") %>% # count(Broad.Phase.of.Flight) %>% # mutate( # phase = Broad.Phase.of.Flight # ) # # phase_count_non_fatal <- df_aviation %>% # filter(Injury.Severity == "Non-Fatal") %>% # count(Broad.Phase.of.Flight) %>% # mutate( # phase = Broad.Phase.of.Flight # ) # # p<- ggplot() + # geom_point(data = phase_count_non_fatal, stat = "identity", aes(x = reorder(phase, -n), y=n, color = 'red')) + # geom_bar(data = phase_count_fatal, stat = "identity", aes(x= reorder(phase, -n), y = n)) + # theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) # p #{r} # fit_date <- # glm( # formula = Fatal ~ Month + Weather.Condition, # data = df_date_polar %>% # filter( # !is.na(Fatal), # Month %in% c(1, 7, 10), # Weather.Condition %in% c(" IMC ", " VMC ", " UNK ") # ) %>% # mutate(Weather.Condition = fct_relevel(Weather.Condition, " UNK ")), # family = "binomial" # ) # # fit_date %>% tidy() #In this project, we focus on studying the air crash investigation dataset from National Transportation Safety Board (NTSB). We want to study the question: How well can flight parameters (engine type/number, weather condition, the broad phase of flight, etc.) be used to predict the fatality of a flight?
The dataset we used contained information on air crashes investigated by the NTSB from 1982 to 2007. The investigation range of NTSB includes all air crashes happened within U.S. territory, or when the involved aircraft are U.S. made, or the aircraft carries passengers/crew members that are U.S. citizens, or otherwise invited by the transportation safety bureau from another country. Commercial airline crashes as well as other aviation crashes are included in this dataset.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.4 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(modelr)
library(broom)
##
## Attaching package: 'broom'
## The following object is masked from 'package:modelr':
##
## bootstrap
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(gganimate)
filename <- "./data/AviationData.txt"
df_aviation <- read.delim2(filename, header = TRUE, sep = "|", dec = ".")
df_aviation <- df_aviation %>%
filter(str_detect(Aircraft.Category, "Airplane"), str_detect(Investigation.Type, "Accident")) %>%
mutate(
IsGeneralAviation = str_detect(FAR.Description, "91")
)
df_aviation
df_aviation %>% glimpse()
## Rows: 6,486
## Columns: 33
## $ Event.Id <chr> "20080125X00106 ", "20080114X00045 ", "2008012…
## $ Investigation.Type <chr> " Accident ", " Accident ", " Accident ", " Ac…
## $ Accident.Number <chr> " SEA08CA056 ", " LAX08FA043 ", " CHI08CA057 "…
## $ Event.Date <chr> " 12/31/2007 ", " 12/30/2007 ", " 12/30/2007 "…
## $ Location <chr> " Santa Ana, CA ", " Paso Robles, CA ", " Alex…
## $ Country <chr> " United States ", " United States ", " United…
## $ Latitude <dbl> 33.67556, 35.54222, 45.86611, 38.53389, 46.800…
## $ Longitude <dbl> -117.86806, -120.52278, -95.39444, -106.93306,…
## $ Airport.Code <chr> " SNA ", " PRB ", " AXN ", " ", " ", " ", "…
## $ Airport.Name <chr> " John Wayne - Orange County ", " Paso Robles …
## $ Injury.Severity <chr> " Non-Fatal ", " Fatal(1) ", " Non-Fatal ", " …
## $ Aircraft.Damage <chr> " Substantial ", " Substantial ", " Substantia…
## $ Aircraft.Category <chr> " Airplane ", " Airplane ", " Airplane ", " Ai…
## $ Registration.Number <chr> " N2800D ", " N254SR ", " N5093F ", " N33MF ",…
## $ Make <chr> " Piper ", " Cirrus Design Corp. ", " Lerohl "…
## $ Model <chr> " PA-12 ", " SR22 ", " RV-8 ", " PA-46-310P ",…
## $ Amateur.Built <chr> " No ", " No ", " Yes ", " No ", " No ", " No …
## $ Number.of.Engines <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 2…
## $ Engine.Type <chr> " Reciprocating ", " Reciprocating ", " Recipr…
## $ FAR.Description <chr> " Part 91: General Aviation ", " Part 91: Gene…
## $ Schedule <chr> " ", " ", " ", " ", " ", " ", " ", " "…
## $ Purpose.of.Flight <chr> " Instructional ", " Personal ", " Personal ",…
## $ Air.Carrier <chr> " ", " ", " ", " ", " ", " ", " ", " "…
## $ Total.Fatal.Injuries <dbl> NA, 1, NA, NA, NA, NA, 0, NA, 1, NA, 0, 0, NA,…
## $ Total.Serious.Injuries <dbl> NA, NA, NA, NA, NA, NA, 1, NA, 1, NA, 0, 0, NA…
## $ Total.Minor.Injuries <dbl> NA, NA, NA, NA, NA, NA, 0, NA, NA, NA, 0, 2, N…
## $ Total.Uninjured <dbl> 2, NA, 1, 5, 1, 2, 0, 1, NA, 1, 1, 0, 2, 2, 29…
## $ Weather.Condition <chr> " VMC ", " VMC ", " VMC ", " VMC ", " VMC ", "…
## $ Broad.Phase.of.Flight <chr> " LANDING ", " MANEUVERING ", " TAKEOFF ", " T…
## $ Report.Status <chr> " Probable Cause ", " Probable Cause ", " Prob…
## $ Publication.Date <chr> " 02/28/2008 ", " 06/20/2014 ", " 02/28/2008 "…
## $ X <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
## $ IsGeneralAviation <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE…
Observations:
A glimpse at the dataset shows that 6,732 accidents and incidents are recorded in this dataset. 32 columns are presented, in which the total number of fatal, serious, and minor injuries, as well as the total uninjured number are recorded as output variables, and other variables including Make, Model, Number.of.Engines, Engine.Type, Purpose.of.Flight and Broad.Phase.of.Flight are interpreted as input variables in this study.
df_binomial <- df_aviation %>% mutate(Fatal = as.logical(Total.Fatal.Injuries))
df_binomial %>% count(Fatal)
Observations:
A simple check on the Total.Fatal.Injuries column of the dataset reveals that from 1982 to 2007, there are 2623 non-fatal crashes and 944 fatal crashes, and 3165 flight crash information are not available.
df_aviation_general <- df_aviation %>%
filter (str_detect(FAR.Description, "91"), str_detect(Investigation.Type, "Accident"))
df_aviation_general %>%
count(Total.Fatal.Injuries)
df_aviation %>%
count(Total.Fatal.Injuries)
df_aviation %>%
filter(Total.Fatal.Injuries > 150)
df_aviation_general %>%
filter(Total.Fatal.Injuries < 150) %>%
ggplot() +
geom_boxplot(mapping = aes(x = Broad.Phase.of.Flight, y = Total.Fatal.Injuries )) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
{r} # phase_count_fatal <- df_aviation %>% # filter(Injury.Severity != "Non-Fatal") %>% # count(Broad.Phase.of.Flight) %>% # mutate( # phase = Broad.Phase.of.Flight # ) # # phase_count_non_fatal <- df_aviation %>% # filter(Injury.Severity == "Non-Fatal") %>% # count(Broad.Phase.of.Flight) %>% # mutate( # phase = Broad.Phase.of.Flight # ) # # p<- ggplot() + # geom_point(data = phase_count_non_fatal, stat = "identity", aes(x = reorder(phase, -n), y=n, color = 'red')) + # geom_bar(data = phase_count_fatal, stat = "identity", aes(x= reorder(phase, -n), y = n)) + # theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) # p #Observations:
df_date <- df_aviation %>%
count(Event.Date) %>%
mutate(
Month.Day = substr(Event.Date,2,6)
) %>%
group_by(Month.Day) %>%
mutate(
count = sum(n)
)
glimpse(df_date)
## Rows: 2,031
## Columns: 4
## Groups: Month.Day [366]
## $ Event.Date <chr> " 01/01/1982 ", " 01/01/2006 ", " 01/01/2007 ", " 01/02/19…
## $ n <int> 2, 1, 1, 8, 2, 3, 8, 1, 4, 2, 1, 5, 2, 2, 2, 7, 1, 5, 1, 1…
## $ Month.Day <chr> "01/01", "01/01", "01/01", "01/02", "01/02", "01/02", "01/…
## $ count <int> 4, 4, 4, 13, 13, 13, 16, 16, 16, 16, 16, 11, 11, 11, 11, 8…
df_date$Month.Day <- as.Date(df_date$Month.Day, "%m/%d")
df_date %>%
ggplot() +
geom_point(mapping = aes(x = Month.Day, y = count, fill = )) +
scale_x_date(date_breaks = "1 month", date_minor_breaks = "1 week",
date_labels = "%B") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
ggtitle("Accidents by Time of Year")
df_date %>%
ggplot() +
geom_point(mapping = aes(x = Month.Day, y = count )) +
scale_x_date(date_breaks = "1 month", date_minor_breaks = "1 week",
date_labels = "%B") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
ggtitle("Accidents by Time of Year")
Observations:
df_date_polar <-df_aviation %>%
count(Event.Date) %>%
mutate(
Month.Day = substr(Event.Date,2,6),
Year = substr(Event.Date, 8, 11),
Date = paste(Year, substr(Event.Date, 2, 3), substr(Event.Date,5,6), sep = "-", collapse = NULL),
Month = paste("2020", substr(Event.Date, 2, 3), "01", sep = "-", collapse = NULL)
)
df_date_polar$Month.Day = as.Date(df_date_polar$Month.Day, format = "%m/%d")
df_date_polar$Date = as.Date(df_date_polar$Date, format = "%Y-%m-%d")
df_date_polar$Month = as.Date(df_date_polar$Month, format = "%Y-%m-%d")
df_date_polar
# AGGREGATED BY MONTH
#preprocess data
df_month <- df_date_polar %>%
group_by(Month, Year) %>%
mutate(
count = sum(n)
) %>%
ungroup() %>%
arrange(Date) %>%
mutate(
frame = seq.int(nrow(df_date_polar))
)
df_month$smoothed <- predict(loess(n ~ as.numeric(Date), data = df_month), newdata = df_month[, c("Date")])
fig <- df_month %>%
ggplot(aes(x = Month, y = count, group = Year, color = Year)) +
theme_bw() +
theme(
rect = element_blank(),
axis.title = element_blank(),
legend.position = "none"
) +
labs(title = "Monthly Plane Accidents") +
geom_line(aes(group = Year), size = 0.15) +
geom_point(color = "gray20") +
scale_y_continuous() +
scale_x_date(date_breaks = "1 month", date_labels = "%b", expand = expansion(add = c(0, -15))) +
geom_text(aes(x = as.Date("2020-10-01"), y = 300, label = Year), position = position_nudge(y = -0.1, x = 0), size = 10, color = "gray20") +
transition_reveal(along = frame, keep_last = FALSE)
animate(fig, nframes = 200, fps = 10, width=1200,height=800, res = 130,
render = av_renderer("Monthly Airplane Accidents.mp4", codec = "libx264"))
df_binomial %>%
filter(is.na(Fatal) == FALSE) %>%
group_by(Weather.Condition) %>%
count(Fatal) %>%
ggplot(aes(x = Weather.Condition, y = n, fill = Fatal)) +
geom_col(position = "dodge") +
ggtitle("Number of Fatal and Non-fatal Crashes in Different Weather Conditions")
Observations:
VMC, or visual meteorological conditions, means that pilots have sufficient visibility to fly the aircraft maintaining visual separation from terrain and other aircraft. IMC, or instrumental meteorological conditions, means that pilots need to fly primarily by reference to instruments, and therefore under instrument flight rules (IFR). Typically, IMC means flying in cloudy or bad weather, while VMC means fine weather.
Comparing the absolute value of fatal crashes in VMC is higher than IMC. However, this may be due to the fact that much more flights are flown in VMC, as flights are likely to be canceled or diverted in deteriorating weather. Comparing the ratio of fatal and non-fatal crashes in VMC and IMC, it turns out that in IMC the crashes have a higher odds of being fatal.
The contribution of weather condition to a fatal crash will be investigated in later sections by fitting it into a logistic model.
df_binomial %>%
filter(is.na(Fatal) == FALSE) %>%
group_by(Aircraft.Damage) %>%
count(Fatal) %>%
ggplot(aes(x = Aircraft.Damage, y = n, fill = Fatal)) +
geom_col(position = "dodge") +
ggtitle("Number of Fatal and Non-fatal Crashes for different Damage")
Observations:
We can reasonably assume that, when an aircraft is severely damaged or destroyed, there is a higher probability of having fatalities, while for minor and substantial damages the fatality is much lower. This figure above proves that, the destroyed air crashes are much more fatal comparing to other categories.
The Aircraft.Damage criteria will be studied in later sections where a model is fitted to figure out how much the aircraft damage contributes to fatality.
fit_weather <-
glm(
formula = Fatal ~ Weather.Condition,
data = df_binomial %>%
filter(
!is.na(Fatal), Weather.Condition %in% c(" IMC ", " VMC ", " UNK ")
) %>%
mutate(Weather.Condition = fct_relevel(Weather.Condition, " UNK ")),
family = "binomial"
)
fit_weather %>% tidy()
Observations:
fit_damage <-
glm(
formula = Fatal ~ Aircraft.Damage,
data = df_binomial %>%
filter(
!is.na(Fatal), Aircraft.Damage %in% c(" Destroyed ", " Minor ", " Substantial ")
) %>%
mutate(Aircraft.Damage = fct_relevel(Aircraft.Damage, " Substantial ")),
family = "binomial"
)
fit_damage %>% tidy()
Observations:
df_aviation_dates <-df_aviation %>%
mutate(
Month.Day = substr(Event.Date,2,6),
Year = substr(Event.Date, 8, 11),
Date = paste(Year, substr(Event.Date, 2, 3), substr(Event.Date,5,6), sep = "-", collapse = NULL),
Month = paste("2020", substr(Event.Date, 2, 3), "01", sep = "-", collapse = NULL)
)
df_aviation_dates$Month.Day = as.Date(df_aviation_dates$Month.Day, format = "%m/%d")
df_aviation_dates$Date = as.Date(df_aviation_dates$Date, format = "%Y-%m-%d")
df_aviation_dates$Month = as.Date(df_aviation_dates$Month, format = "%Y-%m-%d")
# AGGREGATED BY MONTH
#preprocess data
df_month <- df_aviation_dates %>%
group_by(Month, Year, IsGeneralAviation) %>%
mutate(
count = n(),
average = mean(n())
) %>%
ungroup() %>%
arrange(Date)
df_month$frame <- seq.int(nrow(df_month))
df_month
fig <- df_month %>%
ggplot(aes(color = ifelse(IsGeneralAviation, "General Aviation", "Commercial + Other"))) +
theme_bw() +
labs(title = "Average Monthly Plane Accidents") +
geom_line(data = df_month %>% filter(IsGeneralAviation), aes(x = Month, y = average, group = Year), size = 0.15) +
geom_line(data = df_month %>% filter(!IsGeneralAviation), aes(x = Month, y = average, group = Year), size = 0.15) +
labs(x = "Month",
y = "Accidents",
color = "Flight Type") +
scale_x_date(date_breaks = "1 month", date_labels = "%b", expand = expansion(add = c(0, -15))) +
geom_text(aes(x = as.Date("2020-10-01"), y = 260, label = Year), position = position_nudge(y = -0.1, x = 0), size = 10, color = "gray20") +
transition_reveal(along = frame, keep_last = FALSE)
# animate(fig, nframes = 200, fps = 10, width=1200,height=800, res = 130,
# render = av_renderer("Monthly Airplane Accidents.mp4", codec = "libx264"))
animate(fig, renderer = gifski_renderer(), width=1200, height=800)
{r} # fit_date <- # glm( # formula = Fatal ~ Month + Weather.Condition, # data = df_date_polar %>% # filter( # !is.na(Fatal), # Month %in% c(1, 7, 10), # Weather.Condition %in% c(" IMC ", " VMC ", " UNK ") # ) %>% # mutate(Weather.Condition = fct_relevel(Weather.Condition, " UNK ")), # family = "binomial" # ) # # fit_date %>% tidy() #Observations: